In [2]:
from __future__ import division
import random
In [4]:
def split_data(data, prob):
    """Randomly partition data into two lists with fractions [prob, 1 - prob]."""
    first, second = [], []
    for item in data:
        # each item independently lands in `first` with probability `prob`
        bucket = first if random.random() < prob else second
        bucket.append(item)
    return first, second
def train_test_split(x, y, test_pct):
    """Split paired inputs x and labels y into train/test portions.

    test_pct is the (approximate, random) fraction of pairs assigned
    to the test set. Returns (x_train, x_test, y_train, y_test) as tuples.
    """
    # materialize: in Python 3, zip() is a lazy iterator
    data = list(zip(x, y))
    train, test = split_data(data, 1 - test_pct)
    # unzip trick: zip(*pairs) transposes a list of pairs; guard the
    # empty case, where unpacking zip(*[]) would raise ValueError
    x_train, y_train = zip(*train) if train else ((), ())
    x_test, y_test = zip(*test) if test else ((), ())
    return x_train, x_test, y_train, y_test
In [8]:
def accuracy(tp, fp, fn, tn):
    """Fraction of all predictions that were correct (true pos + true neg)."""
    return (tp + tn) / (tp + fp + fn + tn)
def precision(tp, fp, fn, tn):
    """Of the positive predictions, the fraction that were actually positive."""
    predicted_positive = tp + fp
    return tp / predicted_positive
def recall(tp, fp, fn, tn):
    """Of the actual positives, the fraction the model identified."""
    actual_positive = tp + fn
    return tp / actual_positive
def f1_scoare(tp, fp, fn, tn):
    """Harmonic mean of precision and recall (the F1 score).

    NOTE: the name is misspelled ("scoare"); it is kept so existing
    callers keep working — prefer the `f1_score` alias in new code.
    Raises ZeroDivisionError when precision + recall == 0 (i.e. tp == 0),
    matching the behavior of the component metrics.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    return 2 * p * r / (p + r)

# correctly spelled alias for the misspelled function above
f1_score = f1_scoare
In [9]:
# bias is poor performance even on training
# variance is poor performance across many different trainings
# bias can be solved by adding features
# variance can be reduced by removing features or getting more data
In [ ]: